In [2]:
!pip install wordcloud
Collecting wordcloud
  Downloading https://files.pythonhosted.org/packages/ae/af/849edf14d573eba9c8082db898ff0d090428d9485371cc4fe21a66717ad2/wordcloud-1.5.0-cp36-cp36m-manylinux1_x86_64.whl (361kB)
    100% |████████████████████████████████| 368kB 34.4MB/s 
Requirement already satisfied: numpy>=1.6.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from wordcloud) (1.14.5)
Requirement already satisfied: pillow in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from wordcloud) (5.2.0)
Installing collected packages: wordcloud
Successfully installed wordcloud-1.5.0
You are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.
In [3]:
import pandas as pd
import numpy as np
import scipy.stats as scs
import statsmodels.api as sm
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

%matplotlib inline
%config InlineBackend.figure_format='retina'
In [4]:
df = pd.read_csv('small_descr_clm_code.csv')
df.drop('Unnamed: 0',axis=1, inplace=True)
df.head()
Out[4]:
descr clm code
0 CROSS-REFERENCE TO RELATED APPLICATIONS \n ... 1. A computer-implemented method of designing ... 706
1 RELATED APPLICATIONS \n This application i... What is claimed is: \n \n 1 . A sy... 705
2 CROSS REFERENCE TO RELATED APPLICATION \n ... 1. A weather information display device compri... 706
3 TECHNICAL FIELD \n The present disclosure ... 1 . A method of obtaining a user's measure... 705
4 CROSS-REFERENCE TO RELATED APPLICATIONS \n ... 1 . A method for providing borrower foreclosur... 705
In [5]:
df['descr_clm'] = df.descr + df.clm
df.drop(['descr','clm'],axis=1, inplace=True)
df['code'] = df['code'].astype('category')
In [6]:
df.head()
Out[6]:
code descr_clm
0 706 CROSS-REFERENCE TO RELATED APPLICATIONS \n ...
1 705 RELATED APPLICATIONS \n This application i...
2 706 CROSS REFERENCE TO RELATED APPLICATION \n ...
3 705 TECHNICAL FIELD \n The present disclosure ...
4 705 CROSS-REFERENCE TO RELATED APPLICATIONS \n ...

Word Cloud for 705

In [9]:
df_705 = df[df['code']==705]
In [10]:
df_705.head()
Out[10]:
code descr_clm
1 705 RELATED APPLICATIONS \n This application i...
3 705 TECHNICAL FIELD \n The present disclosure ...
4 705 CROSS-REFERENCE TO RELATED APPLICATIONS \n ...
5 705 CROSS REFERENCE TO OTHER APPLICATIONS \n T...
6 705 CROSS-REFERENCE TO RELATED APPLICATIONS \n ...
In [24]:
custom_stopword_list = ['system','process','method','one', 'may','claim','embodiment','invention','include', 'example', 'include','step','figure','fig']

Add custom list of words to stop words

stopwords is a set so to add a list of words into that set I can use set.add() for an element but that didn't work for a list. I used set |= set(list) . it is a Union function.

In [25]:
stopwords = STOPWORDS
stopwords |= set(custom_stopword_list)
In [26]:
text = df.descr_clm.values

wordcloud = WordCloud(
    width = 3000,
    height = 2000,
    background_color = 'black',
    stopwords = stopwords).generate(str(text))
fig = plt.figure(
    figsize = (40, 30),
    facecolor = 'k',
    edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()
In [29]:
!pip install gensim
---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
<ipython-input-29-130211590d78> in <module>()
----> 1 get_ipython().system('pip install gensim')

~/anaconda3/envs/python3/lib/python3.6/site-packages/IPython/core/interactiveshell.py in system_piped(self, cmd)
   2259         # a non-None value would trigger :func:`sys.displayhook` calls.
   2260         # Instead, we store the exit_code in user_ns.
-> 2261         self.user_ns['_exit_code'] = system(self.var_expand(cmd, depth=1))
   2262 
   2263     def system_raw(self, cmd):

~/anaconda3/envs/python3/lib/python3.6/site-packages/IPython/utils/_process_posix.py in system(self, cmd)
    154                 child = pexpect.spawnb(self.sh, args=['-c', cmd]) # Pexpect-U
    155             else:
--> 156                 child = pexpect.spawn(self.sh, args=['-c', cmd])  # Vanilla Pexpect
    157             flush = sys.stdout.flush
    158             while True:

~/anaconda3/envs/python3/lib/python3.6/site-packages/pexpect/pty_spawn.py in __init__(self, command, args, timeout, maxread, searchwindowsize, logfile, cwd, env, ignore_sighup, echo, preexec_fn, encoding, codec_errors, dimensions, use_poll)
    202             self.name = '<pexpect factory incomplete>'
    203         else:
--> 204             self._spawn(command, args, preexec_fn, dimensions)
    205         self.use_poll = use_poll
    206 

~/anaconda3/envs/python3/lib/python3.6/site-packages/pexpect/pty_spawn.py in _spawn(self, command, args, preexec_fn, dimensions)
    301 
    302         self.ptyproc = self._spawnpty(self.args, env=self.env,
--> 303                                      cwd=self.cwd, **kwargs)
    304 
    305         self.pid = self.ptyproc.pid

~/anaconda3/envs/python3/lib/python3.6/site-packages/pexpect/pty_spawn.py in _spawnpty(self, args, **kwargs)
    312     def _spawnpty(self, args, **kwargs):
    313         '''Spawn a pty and return an instance of PtyProcess.'''
--> 314         return ptyprocess.PtyProcess.spawn(args, **kwargs)
    315 
    316     def close(self, force=True):

~/anaconda3/envs/python3/lib/python3.6/site-packages/ptyprocess/ptyprocess.py in spawn(cls, argv, cwd, env, echo, preexec_fn, dimensions)
    220 
    221         if use_native_pty_fork:
--> 222             pid, fd = pty.fork()
    223         else:
    224             # Use internal fork_pty, for Solaris

~/anaconda3/envs/python3/lib/python3.6/pty.py in fork()
     95 
     96     master_fd, slave_fd = openpty()
---> 97     pid = os.fork()
     98     if pid == CHILD:
     99         # Establish a new session.

OSError: [Errno 12] Cannot allocate memory
In [28]:
import io
from nltk.tokenize import RegexpTokenizer
from gensim import corpora, models
import gensim
from PIL import Image
import PIL.ImageOps
import random
from wordcloud import ImageColorGenerator
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-28-3109c748ad7c> in <module>()
      1 import io
      2 from nltk.tokenize import RegexpTokenizer
----> 3 from gensim import corpora, models
      4 import gensim
      5 from PIL import Image

ModuleNotFoundError: No module named 'gensim'
In [ ]:
 
In [ ]:
 
In [12]:
comment_words = ' '
stopwords = set(STOPWORDS) 
  
# iterate through the csv file 
for val in df_705.descr_clm: 
      
    # typecaste each val to string 
    val = str(val) 
  
    # split the value 
    tokens = val.split() 
      
    # Converts each token into lowercase 
    for i in range(len(tokens)): 
        tokens[i] = tokens[i].lower() 
          
    for words in tokens: 
        comment_words = comment_words + words + ' '
  
  
wordcloud = WordCloud(width = 800, height = 800, 
                background_color ='white', 
                stopwords = stopwords, 
                min_font_size = 10).generate(comment_words) 
  
# plot the WordCloud image                        
plt.figure(figsize = (8, 8), facecolor = None) 
plt.imshow(wordcloud) 
plt.axis("off") 
plt.tight_layout(pad = 0) 
  
plt.show() 
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-12-37a7ace8ee5d> in <module>()
     16 
     17     for words in tokens:
---> 18         comment_words = comment_words + words + ' '
     19 
     20 

KeyboardInterrupt: 
In [ ]: